In [1]:
#Airbnb ML challenge
import numpy as np 
import pandas as pd 


import os
for dirname, _, filenames in os.walk('data/'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
data/calendar.csv
data/listings.csv
data/reviews.csv
In [2]:
calendar_data_path=os.path.join(dirname, 'calendar.csv')
listings_data_path=os.path.join(dirname, 'listings.csv')
reviews_data_path=os.path.join(dirname, 'reviews.csv')
In [3]:
calendar=pd.read_csv(calendar_data_path)
listings=pd.read_csv(listings_data_path)
reviews=pd.read_csv(reviews_data_path)
In [4]:
listings.head(2)
Out[4]:
id listing_url scrape_id last_scraped name summary space description experiences_offered neighborhood_overview ... review_scores_value requires_license license jurisdiction_names instant_bookable cancellation_policy require_guest_profile_picture require_guest_phone_verification calculated_host_listings_count reviews_per_month
0 241032 https://www.airbnb.com/rooms/241032 20160104002432 2016-01-04 Stylish Queen Anne Apartment NaN Make your self at home in this charming one-be... Make your self at home in this charming one-be... none NaN ... 10.0 f NaN WASHINGTON f moderate f f 2 4.07
1 953595 https://www.airbnb.com/rooms/953595 20160104002432 2016-01-04 Bright & Airy Queen Anne Apartment Chemically sensitive? We've removed the irrita... Beautiful, hypoallergenic apartment in an extr... Chemically sensitive? We've removed the irrita... none Queen Anne is a wonderful, truly functional vi... ... 10.0 f NaN WASHINGTON f strict t t 6 1.48

2 rows × 92 columns

In [5]:
listings_df=listings[['price','accommodates','bathrooms','bedrooms','beds','bed_type','room_type','property_type']]
#listings_df=listings[['id','host_id','reviews_per_month','price','accommodates','bathrooms','bedrooms','beds','bed_type','room_type','property_type']]
In [6]:
listings_df.head()
Out[6]:
price accommodates bathrooms bedrooms beds bed_type room_type property_type
0 $85.00 4 1.0 1.0 1.0 Real Bed Entire home/apt Apartment
1 $150.00 4 1.0 1.0 1.0 Real Bed Entire home/apt Apartment
2 $975.00 11 4.5 5.0 7.0 Real Bed Entire home/apt House
3 $100.00 3 1.0 0.0 2.0 Real Bed Entire home/apt Apartment
4 $450.00 6 2.0 3.0 3.0 Real Bed Entire home/apt House
In [7]:
listings_df_cleaned=listings_df.dropna()
In [8]:
#Percent of rows dropped after cleaning
((len(listings_df)-len(listings_df_cleaned))/len(listings_df))*100
Out[8]:
0.6024096385542169
In [9]:
listings_df_cleaned.head()
Out[9]:
price accommodates bathrooms bedrooms beds bed_type room_type property_type
0 $85.00 4 1.0 1.0 1.0 Real Bed Entire home/apt Apartment
1 $150.00 4 1.0 1.0 1.0 Real Bed Entire home/apt Apartment
2 $975.00 11 4.5 5.0 7.0 Real Bed Entire home/apt House
3 $100.00 3 1.0 0.0 2.0 Real Bed Entire home/apt Apartment
4 $450.00 6 2.0 3.0 3.0 Real Bed Entire home/apt House
In [10]:
listings_df_cleaned['price']=listings_df_cleaned['price'].str.replace("[$, ]", "").astype("float")
D:\InstalledTools\AnacondaInstaller\lib\site-packages\ipykernel_launcher.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
In [11]:
listings_df_cleaned.head()
Out[11]:
price accommodates bathrooms bedrooms beds bed_type room_type property_type
0 85.0 4 1.0 1.0 1.0 Real Bed Entire home/apt Apartment
1 150.0 4 1.0 1.0 1.0 Real Bed Entire home/apt Apartment
2 975.0 11 4.5 5.0 7.0 Real Bed Entire home/apt House
3 100.0 3 1.0 0.0 2.0 Real Bed Entire home/apt Apartment
4 450.0 6 2.0 3.0 3.0 Real Bed Entire home/apt House
In [12]:
listings_df_cleaned.bed_type.unique()
Out[12]:
array(['Real Bed', 'Futon', 'Pull-out Sofa', 'Airbed', 'Couch'],
      dtype=object)
In [13]:
listings_df_cleaned.room_type.unique()
Out[13]:
array(['Entire home/apt', 'Private room', 'Shared room'], dtype=object)
In [14]:
listings_df_cleaned.property_type.unique()
Out[14]:
array(['Apartment', 'House', 'Cabin', 'Condominium', 'Camper/RV',
       'Bungalow', 'Townhouse', 'Loft', 'Boat', 'Bed & Breakfast',
       'Other', 'Dorm', 'Treehouse', 'Yurt', 'Chalet', 'Tent'],
      dtype=object)
In [15]:
import plotly.express as px
In [16]:
fig =  px.scatter(listings_df_cleaned, x="accommodates", y="price",title="Accommodates vs Price")
fig.show()
In [17]:
fig =  px.scatter(listings_df_cleaned, x="bedrooms", y="price",title="Bedrooms vs Price")
fig.show()
In [18]:
fig =  px.scatter(listings_df_cleaned, x="bathrooms", y="price",title="Bathrooms vs Price")
fig.show()
In [19]:
fig =  px.scatter(listings_df_cleaned, x="beds", y="price",title="beds vs Price")
fig.show()
In [20]:
fig =  px.scatter(listings_df_cleaned, x="bed_type", y="price",title="Bed Type vs Price")
fig.show()
In [21]:
fig =  px.scatter(listings_df_cleaned, x="room_type", y="price",title="Room Type vs Price")
fig.show()
In [22]:
fig =  px.scatter(listings_df_cleaned, x="property_type", y="price",title="Property Type vs Price")
fig.show()
In [23]:
#One hot encoding of some of the categorical non-numerical data
listings_df_cleaned = pd.concat((listings_df_cleaned,pd.get_dummies(listings_df_cleaned.room_type,prefix='room-type')),1)
listings_df_cleaned = pd.concat((listings_df_cleaned,pd.get_dummies(listings_df_cleaned.bed_type,prefix='bed_type')),1)
listings_df_cleaned = pd.concat((listings_df_cleaned,pd.get_dummies(listings_df_cleaned.property_type,prefix='property_type')),1)
In [24]:
#Remove One Hot Encoded columns
listings_df_cleaned=listings_df_cleaned.drop(columns=['bed_type','room_type','property_type'])
In [25]:
listings_df_cleaned.head()
Out[25]:
price accommodates bathrooms bedrooms beds room-type_Entire home/apt room-type_Private room room-type_Shared room bed_type_Airbed bed_type_Couch ... property_type_Chalet property_type_Condominium property_type_Dorm property_type_House property_type_Loft property_type_Other property_type_Tent property_type_Townhouse property_type_Treehouse property_type_Yurt
0 85.0 4 1.0 1.0 1.0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 150.0 4 1.0 1.0 1.0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 975.0 11 4.5 5.0 7.0 1 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
3 100.0 3 1.0 0.0 2.0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 450.0 6 2.0 3.0 3.0 1 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0

5 rows × 29 columns

In [26]:
from sklearn import linear_model
In [27]:
listings_df_cleaned.loc[:, listings_df_cleaned.columns != 'price']
Out[27]:
accommodates bathrooms bedrooms beds room-type_Entire home/apt room-type_Private room room-type_Shared room bed_type_Airbed bed_type_Couch bed_type_Futon ... property_type_Chalet property_type_Condominium property_type_Dorm property_type_House property_type_Loft property_type_Other property_type_Tent property_type_Townhouse property_type_Treehouse property_type_Yurt
0 4 1.0 1.0 1.0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 4 1.0 1.0 1.0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 11 4.5 5.0 7.0 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
3 3 1.0 0.0 2.0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 6 2.0 3.0 3.0 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
3813 6 2.0 3.0 3.0 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
3814 4 1.0 1.0 2.0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3815 2 1.0 1.0 1.0 1 0 0 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
3816 2 1.0 0.0 1.0 1 0 0 0 0 0 ... 0 1 0 0 0 0 0 0 0 0
3817 3 1.5 2.0 1.0 1 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

3795 rows × 28 columns

In [28]:
reg = linear_model.LinearRegression()
reg.fit(listings_df_cleaned.loc[:, listings_df_cleaned.columns != 'price'],listings_df_cleaned.price)
#reg.fit(listings_df_cleaned[['accommodates','bathrooms','bedrooms','beds']],listings_df_cleaned.price)
Out[28]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
In [ ]: